Hotel%20booking%20prediction%20title.png

In [1]:
# importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

import folium
from folium.plugins import HeatMap
import plotly.express as px

plt.style.use('fivethirtyeight')
%matplotlib inline
pd.set_option('display.max_columns', 32)
In [2]:
# reading data

df = pd.read_csv("C:/Users/VANAM GANESH/Downloads/hotel_bookings.csv/hotel_bookings.csv")
df.head()
Out[2]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies meal country market_segment distribution_channel is_repeated_guest previous_cancellations previous_bookings_not_canceled reserved_room_type assigned_room_type booking_changes deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct 0 0 0 C C 3 No Deposit NaN NaN 0 Transient 0.0 0 0 Check-Out 2015-07-01
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct 0 0 0 C C 4 No Deposit NaN NaN 0 Transient 0.0 0 0 Check-Out 2015-07-01
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 0.0 0 BB GBR Direct Direct 0 0 0 A C 0 No Deposit NaN NaN 0 Transient 75.0 0 0 Check-Out 2015-07-02
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 0.0 0 BB GBR Corporate Corporate 0 0 0 A A 0 No Deposit 304.0 NaN 0 Transient 75.0 0 0 Check-Out 2015-07-02
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 0.0 0 BB GBR Online TA TA/TO 0 0 0 A A 0 No Deposit 240.0 NaN 0 Transient 98.0 0 1 Check-Out 2015-07-03
In [3]:
df.describe()
Out[3]:
is_canceled lead_time arrival_date_year arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies is_repeated_guest previous_cancellations previous_bookings_not_canceled booking_changes agent company days_in_waiting_list adr required_car_parking_spaces total_of_special_requests
count 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 119386.000000 119390.000000 119390.000000 119390.000000 119390.000000 119390.000000 103050.000000 6797.000000 119390.000000 119390.000000 119390.000000 119390.000000
mean 0.370416 104.011416 2016.156554 27.165173 15.798241 0.927599 2.500302 1.856403 0.103890 0.007949 0.031912 0.087118 0.137097 0.221124 86.693382 189.266735 2.321149 101.831122 0.062518 0.571363
std 0.482918 106.863097 0.707476 13.605138 8.780829 0.998613 1.908286 0.579261 0.398561 0.097436 0.175767 0.844336 1.497437 0.652306 110.774548 131.655015 17.594721 50.535790 0.245291 0.792798
min 0.000000 0.000000 2015.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 6.000000 0.000000 -6.380000 0.000000 0.000000
25% 0.000000 18.000000 2016.000000 16.000000 8.000000 0.000000 1.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 9.000000 62.000000 0.000000 69.290000 0.000000 0.000000
50% 0.000000 69.000000 2016.000000 28.000000 16.000000 1.000000 2.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 14.000000 179.000000 0.000000 94.575000 0.000000 0.000000
75% 1.000000 160.000000 2017.000000 38.000000 23.000000 2.000000 3.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 229.000000 270.000000 0.000000 126.000000 0.000000 1.000000
max 1.000000 737.000000 2017.000000 53.000000 31.000000 19.000000 50.000000 55.000000 10.000000 10.000000 1.000000 26.000000 72.000000 21.000000 535.000000 543.000000 391.000000 5400.000000 8.000000 5.000000
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal                            119390 non-null  object 
 13  country                         118902 non-null  object 
 14  market_segment                  119390 non-null  object 
 15  distribution_channel            119390 non-null  object 
 16  is_repeated_guest               119390 non-null  int64  
 17  previous_cancellations          119390 non-null  int64  
 18  previous_bookings_not_canceled  119390 non-null  int64  
 19  reserved_room_type              119390 non-null  object 
 20  assigned_room_type              119390 non-null  object 
 21  booking_changes                 119390 non-null  int64  
 22  deposit_type                    119390 non-null  object 
 23  agent                           103050 non-null  float64
 24  company                         6797 non-null    float64
 25  days_in_waiting_list            119390 non-null  int64  
 26  customer_type                   119390 non-null  object 
 27  adr                             119390 non-null  float64
 28  required_car_parking_spaces     119390 non-null  int64  
 29  total_of_special_requests       119390 non-null  int64  
 30  reservation_status              119390 non-null  object 
 31  reservation_status_date         119390 non-null  object 
dtypes: float64(4), int64(16), object(12)
memory usage: 29.1+ MB
In [5]:
# checking for null values 

null = pd.DataFrame({'Null Values' : df.isna().sum(), 'Percentage Null Values' : (df.isna().sum()) / (df.shape[0]) * (100)})
null
Out[5]:
Null Values Percentage Null Values
hotel 0 0.000000
is_canceled 0 0.000000
lead_time 0 0.000000
arrival_date_year 0 0.000000
arrival_date_month 0 0.000000
arrival_date_week_number 0 0.000000
arrival_date_day_of_month 0 0.000000
stays_in_weekend_nights 0 0.000000
stays_in_week_nights 0 0.000000
adults 0 0.000000
children 4 0.003350
babies 0 0.000000
meal 0 0.000000
country 488 0.408744
market_segment 0 0.000000
distribution_channel 0 0.000000
is_repeated_guest 0 0.000000
previous_cancellations 0 0.000000
previous_bookings_not_canceled 0 0.000000
reserved_room_type 0 0.000000
assigned_room_type 0 0.000000
booking_changes 0 0.000000
deposit_type 0 0.000000
agent 16340 13.686238
company 112593 94.306893
days_in_waiting_list 0 0.000000
customer_type 0 0.000000
adr 0 0.000000
required_car_parking_spaces 0 0.000000
total_of_special_requests 0 0.000000
reservation_status 0 0.000000
reservation_status_date 0 0.000000
In [6]:
# filling null values with zero

df.fillna(0, inplace = True)
In [7]:
# visualizing null values

msno.bar(df)
plt.show()
In [8]:
# adults, babies and children cant be zero at same time, so dropping the rows having all these zero at same time

filter = (df.children == 0) & (df.adults == 0) & (df.babies == 0)
df[filter]
Out[8]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies meal country market_segment distribution_channel is_repeated_guest previous_cancellations previous_bookings_not_canceled reserved_room_type assigned_room_type booking_changes deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
2224 Resort Hotel 0 1 2015 October 41 6 0 3 0 0.0 0 SC PRT Corporate Corporate 0 0 0 A I 1 No Deposit 0.0 174.0 0 Transient-Party 0.00 0 0 Check-Out 2015-10-06
2409 Resort Hotel 0 0 2015 October 42 12 0 0 0 0.0 0 SC PRT Corporate Corporate 0 0 0 A I 0 No Deposit 0.0 174.0 0 Transient 0.00 0 0 Check-Out 2015-10-12
3181 Resort Hotel 0 36 2015 November 47 20 1 2 0 0.0 0 SC ESP Groups TA/TO 0 0 0 A C 0 No Deposit 38.0 0.0 0 Transient-Party 0.00 0 0 Check-Out 2015-11-23
3684 Resort Hotel 0 165 2015 December 53 30 1 4 0 0.0 0 SC PRT Groups TA/TO 0 0 0 A A 1 No Deposit 308.0 0.0 122 Transient-Party 0.00 0 0 Check-Out 2016-01-04
3708 Resort Hotel 0 165 2015 December 53 30 2 4 0 0.0 0 SC PRT Groups TA/TO 0 0 0 A C 1 No Deposit 308.0 0.0 122 Transient-Party 0.00 0 0 Check-Out 2016-01-05
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
115029 City Hotel 0 107 2017 June 26 27 0 3 0 0.0 0 BB CHE Online TA TA/TO 0 0 0 A A 1 No Deposit 7.0 0.0 0 Transient 100.80 0 0 Check-Out 2017-06-30
115091 City Hotel 0 1 2017 June 26 30 0 1 0 0.0 0 SC PRT Complementary Direct 0 0 0 E K 0 No Deposit 0.0 0.0 0 Transient 0.00 1 1 Check-Out 2017-07-01
116251 City Hotel 0 44 2017 July 28 15 1 1 0 0.0 0 SC SWE Online TA TA/TO 0 0 0 A K 2 No Deposit 425.0 0.0 0 Transient 73.80 0 0 Check-Out 2017-07-17
116534 City Hotel 0 2 2017 July 28 15 2 5 0 0.0 0 SC RUS Online TA TA/TO 0 0 0 A K 1 No Deposit 9.0 0.0 0 Transient-Party 22.86 0 1 Check-Out 2017-07-22
117087 City Hotel 0 170 2017 July 30 27 0 2 0 0.0 0 BB BRA Offline TA/TO TA/TO 0 0 0 A A 0 No Deposit 52.0 0.0 0 Transient 0.00 0 0 Check-Out 2017-07-29

180 rows × 32 columns

In [9]:
df = df[~filter]
df
Out[9]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies meal country market_segment distribution_channel is_repeated_guest previous_cancellations previous_bookings_not_canceled reserved_room_type assigned_room_type booking_changes deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct 0 0 0 C C 3 No Deposit 0.0 0.0 0 Transient 0.00 0 0 Check-Out 2015-07-01
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct 0 0 0 C C 4 No Deposit 0.0 0.0 0 Transient 0.00 0 0 Check-Out 2015-07-01
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 0.0 0 BB GBR Direct Direct 0 0 0 A C 0 No Deposit 0.0 0.0 0 Transient 75.00 0 0 Check-Out 2015-07-02
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 0.0 0 BB GBR Corporate Corporate 0 0 0 A A 0 No Deposit 304.0 0.0 0 Transient 75.00 0 0 Check-Out 2015-07-02
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 0.0 0 BB GBR Online TA TA/TO 0 0 0 A A 0 No Deposit 240.0 0.0 0 Transient 98.00 0 1 Check-Out 2015-07-03
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
119385 City Hotel 0 23 2017 August 35 30 2 5 2 0.0 0 BB BEL Offline TA/TO TA/TO 0 0 0 A A 0 No Deposit 394.0 0.0 0 Transient 96.14 0 0 Check-Out 2017-09-06
119386 City Hotel 0 102 2017 August 35 31 2 5 3 0.0 0 BB FRA Online TA TA/TO 0 0 0 E E 0 No Deposit 9.0 0.0 0 Transient 225.43 0 2 Check-Out 2017-09-07
119387 City Hotel 0 34 2017 August 35 31 2 5 2 0.0 0 BB DEU Online TA TA/TO 0 0 0 D D 0 No Deposit 9.0 0.0 0 Transient 157.71 0 4 Check-Out 2017-09-07
119388 City Hotel 0 109 2017 August 35 31 2 5 2 0.0 0 BB GBR Online TA TA/TO 0 0 0 A A 0 No Deposit 89.0 0.0 0 Transient 104.40 0 0 Check-Out 2017-09-07
119389 City Hotel 0 205 2017 August 35 29 2 7 2 0.0 0 HB DEU Online TA TA/TO 0 0 0 A A 0 No Deposit 9.0 0.0 0 Transient 151.20 0 2 Check-Out 2017-09-07

119210 rows × 32 columns

Hotel%20booking%20prediction%202.png

In [10]:
country_wise_guests = df[df['is_canceled'] == 0]['country'].value_counts().reset_index()
country_wise_guests.columns = ['country', 'No of guests']
country_wise_guests
Out[10]:
country No of guests
0 PRT 20977
1 GBR 9668
2 FRA 8468
3 ESP 6383
4 DEU 6067
... ... ...
161 BHR 1
162 DJI 1
163 MLI 1
164 NPL 1
165 FRO 1

166 rows × 2 columns

In [11]:
basemap = folium.Map()
guests_map = px.choropleth(country_wise_guests, locations = country_wise_guests['country'],
                           color = country_wise_guests['No of guests'], hover_name = country_wise_guests['country'])
guests_map.show()

Hotel%20booking%20prediction%203.png

In [12]:
df.head()
Out[12]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies meal country market_segment distribution_channel is_repeated_guest previous_cancellations previous_bookings_not_canceled reserved_room_type assigned_room_type booking_changes deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct 0 0 0 C C 3 No Deposit 0.0 0.0 0 Transient 0.0 0 0 Check-Out 2015-07-01
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct 0 0 0 C C 4 No Deposit 0.0 0.0 0 Transient 0.0 0 0 Check-Out 2015-07-01
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 0.0 0 BB GBR Direct Direct 0 0 0 A C 0 No Deposit 0.0 0.0 0 Transient 75.0 0 0 Check-Out 2015-07-02
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 0.0 0 BB GBR Corporate Corporate 0 0 0 A A 0 No Deposit 304.0 0.0 0 Transient 75.0 0 0 Check-Out 2015-07-02
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 0.0 0 BB GBR Online TA TA/TO 0 0 0 A A 0 No Deposit 240.0 0.0 0 Transient 98.0 0 1 Check-Out 2015-07-03

Hotel%20booking%20prediction%204.png

In [13]:
data = df[df['is_canceled'] == 0]

px.box(data_frame = data, x = 'reserved_room_type', y = 'adr', color = 'hotel', template = 'plotly_dark')

Hotel%20booking%20prediction%205.png

In [14]:
data_resort = df[(df['hotel'] == 'Resort Hotel') & (df['is_canceled'] == 0)]
data_city = df[(df['hotel'] == 'City Hotel') & (df['is_canceled'] == 0)]
In [15]:
resort_hotel = data_resort.groupby(['arrival_date_month'])['adr'].mean().reset_index()
resort_hotel
Out[15]:
arrival_date_month adr
0 April 75.867816
1 August 181.205892
2 December 68.410104
3 February 54.147478
4 January 48.761125
5 July 150.122528
6 June 107.974850
7 March 57.056838
8 May 76.657558
9 November 48.706289
10 October 61.775449
11 September 96.416860
In [16]:
city_hotel=data_city.groupby(['arrival_date_month'])['adr'].mean().reset_index()
city_hotel
Out[16]:
arrival_date_month adr
0 April 111.962267
1 August 118.674598
2 December 88.401855
3 February 86.520062
4 January 82.330983
5 July 115.818019
6 June 117.874360
7 March 90.658533
8 May 120.669827
9 November 86.946592
10 October 102.004672
11 September 112.776582
In [17]:
final_hotel = resort_hotel.merge(city_hotel, on = 'arrival_date_month')
final_hotel.columns = ['month', 'price_for_resort', 'price_for_city_hotel']
final_hotel
Out[17]:
month price_for_resort price_for_city_hotel
0 April 75.867816 111.962267
1 August 181.205892 118.674598
2 December 68.410104 88.401855
3 February 54.147478 86.520062
4 January 48.761125 82.330983
5 July 150.122528 115.818019
6 June 107.974850 117.874360
7 March 57.056838 90.658533
8 May 76.657558 120.669827
9 November 48.706289 86.946592
10 October 61.775449 102.004672
11 September 96.416860 112.776582

Hotel%20booking%20prediction%206.png

In [18]:
!pip install sort-dataframeby-monthorweek

!pip install sorted-months-weekdays
Requirement already satisfied: sort-dataframeby-monthorweek in c:\users\vanam ganesh\anaconda3\lib\site-packages (0.4)
Requirement already satisfied: sorted-months-weekdays in c:\users\vanam ganesh\anaconda3\lib\site-packages (0.2)
In [19]:
import sort_dataframeby_monthorweek as sd

def sort_month(df, column_name):
    return sd.Sort_Dataframeby_Month(df, column_name)
In [20]:
final_prices = sort_month(final_hotel, 'month')
final_prices
Out[20]:
month price_for_resort price_for_city_hotel
0 January 48.761125 82.330983
1 February 54.147478 86.520062
2 March 57.056838 90.658533
3 April 75.867816 111.962267
4 May 76.657558 120.669827
5 June 107.974850 117.874360
6 July 150.122528 115.818019
7 August 181.205892 118.674598
8 September 96.416860 112.776582
9 October 61.775449 102.004672
10 November 48.706289 86.946592
11 December 68.410104 88.401855
In [21]:
plt.figure(figsize = (17, 8))

px.line(final_prices, x = 'month', y = ['price_for_resort','price_for_city_hotel'],
        title = 'Room price per night over the Months', template = 'plotly_dark')
<Figure size 1700x800 with 0 Axes>

Hotel%20booking%20prediction%207.png

In [22]:
resort_guests = data_resort['arrival_date_month'].value_counts().reset_index()
resort_guests.columns=['month','no of guests']
resort_guests
Out[22]:
month no of guests
0 August 3257
1 July 3137
2 October 2575
3 March 2571
4 April 2550
5 May 2535
6 February 2308
7 September 2102
8 June 2037
9 December 2014
10 November 1975
11 January 1866
In [23]:
city_guests = data_city['arrival_date_month'].value_counts().reset_index()
city_guests.columns=['month','no of guests']
city_guests
Out[23]:
month no of guests
0 August 5367
1 July 4770
2 May 4568
3 June 4358
4 October 4326
5 September 4283
6 March 4049
7 April 4010
8 February 3051
9 November 2676
10 December 2377
11 January 2249
In [24]:
final_guests = resort_guests.merge(city_guests,on='month')
final_guests.columns=['month','no of guests in resort','no of guest in city hotel']
final_guests
Out[24]:
month no of guests in resort no of guest in city hotel
0 August 3257 5367
1 July 3137 4770
2 October 2575 4326
3 March 2571 4049
4 April 2550 4010
5 May 2535 4568
6 February 2308 3051
7 September 2102 4283
8 June 2037 4358
9 December 2014 2377
10 November 1975 2676
11 January 1866 2249
In [25]:
final_guests = sort_month(final_guests,'month')
final_guests
Out[25]:
month no of guests in resort no of guest in city hotel
0 January 1866 2249
1 February 2308 3051
2 March 2571 4049
3 April 2550 4010
4 May 2535 4568
5 June 2037 4358
6 July 3137 4770
7 August 3257 5367
8 September 2102 4283
9 October 2575 4326
10 November 1975 2676
11 December 2014 2377
In [26]:
px.line(final_guests, x = 'month', y = ['no of guests in resort','no of guest in city hotel'],
        title='Total no of guests per Months', template = 'plotly_dark')

Hotel%20booking%20prediction%208.png

In [27]:
filter = df['is_canceled'] == 0
data = df[filter]
data.head()
Out[27]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies meal country market_segment distribution_channel is_repeated_guest previous_cancellations previous_bookings_not_canceled reserved_room_type assigned_room_type booking_changes deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct 0 0 0 C C 3 No Deposit 0.0 0.0 0 Transient 0.0 0 0 Check-Out 2015-07-01
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct 0 0 0 C C 4 No Deposit 0.0 0.0 0 Transient 0.0 0 0 Check-Out 2015-07-01
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 0.0 0 BB GBR Direct Direct 0 0 0 A C 0 No Deposit 0.0 0.0 0 Transient 75.0 0 0 Check-Out 2015-07-02
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 0.0 0 BB GBR Corporate Corporate 0 0 0 A A 0 No Deposit 304.0 0.0 0 Transient 75.0 0 0 Check-Out 2015-07-02
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 0.0 0 BB GBR Online TA TA/TO 0 0 0 A A 0 No Deposit 240.0 0.0 0 Transient 98.0 0 1 Check-Out 2015-07-03
In [28]:
data['total_nights'] = data['stays_in_weekend_nights'] + data['stays_in_week_nights']
data.head()
Out[28]:
hotel is_canceled lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies meal country market_segment distribution_channel ... previous_cancellations previous_bookings_not_canceled reserved_room_type assigned_room_type booking_changes deposit_type agent company days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status reservation_status_date total_nights
0 Resort Hotel 0 342 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct ... 0 0 C C 3 No Deposit 0.0 0.0 0 Transient 0.0 0 0 Check-Out 2015-07-01 0
1 Resort Hotel 0 737 2015 July 27 1 0 0 2 0.0 0 BB PRT Direct Direct ... 0 0 C C 4 No Deposit 0.0 0.0 0 Transient 0.0 0 0 Check-Out 2015-07-01 0
2 Resort Hotel 0 7 2015 July 27 1 0 1 1 0.0 0 BB GBR Direct Direct ... 0 0 A C 0 No Deposit 0.0 0.0 0 Transient 75.0 0 0 Check-Out 2015-07-02 1
3 Resort Hotel 0 13 2015 July 27 1 0 1 1 0.0 0 BB GBR Corporate Corporate ... 0 0 A A 0 No Deposit 304.0 0.0 0 Transient 75.0 0 0 Check-Out 2015-07-02 1
4 Resort Hotel 0 14 2015 July 27 1 0 2 2 0.0 0 BB GBR Online TA TA/TO ... 0 0 A A 0 No Deposit 240.0 0.0 0 Transient 98.0 0 1 Check-Out 2015-07-03 2

5 rows × 33 columns

In [29]:
stay = data.groupby(['total_nights', 'hotel']).agg('count').reset_index()
stay = stay.iloc[:, :3]
stay = stay.rename(columns={'is_canceled':'Number of stays'})
stay
Out[29]:
total_nights hotel Number of stays
0 0 City Hotel 251
1 0 Resort Hotel 371
2 1 City Hotel 9155
3 1 Resort Hotel 6579
4 2 City Hotel 10983
... ... ... ...
57 46 Resort Hotel 1
58 48 City Hotel 1
59 56 Resort Hotel 1
60 60 Resort Hotel 1
61 69 Resort Hotel 1

62 rows × 3 columns

In [30]:
px.bar(data_frame = stay, x = 'total_nights', y = 'Number of stays', color = 'hotel', barmode = 'group',
        template = 'plotly_dark')

Hotel%20booking%20prediction%209.png

In [31]:
plt.figure(figsize = (24, 12))

corr = df.corr()
sns.heatmap(corr, annot = True, linewidths = 1)
plt.show()
In [32]:
correlation = df.corr()['is_canceled'].abs().sort_values(ascending = False)
correlation
Out[32]:
is_canceled                       1.000000
lead_time                         0.292876
total_of_special_requests         0.234877
required_car_parking_spaces       0.195701
booking_changes                   0.144832
previous_cancellations            0.110139
is_repeated_guest                 0.083745
company                           0.083594
adults                            0.058182
previous_bookings_not_canceled    0.057365
days_in_waiting_list              0.054301
agent                             0.046770
adr                               0.046492
babies                            0.032569
stays_in_week_nights              0.025542
arrival_date_year                 0.016622
arrival_date_week_number          0.008315
arrival_date_day_of_month         0.005948
children                          0.004851
stays_in_weekend_nights           0.001323
Name: is_canceled, dtype: float64
In [33]:
# dropping columns that are not useful

useless_col = ['days_in_waiting_list', 'arrival_date_year', 'arrival_date_year', 'assigned_room_type', 'booking_changes',
               'reservation_status', 'country', 'days_in_waiting_list']

df.drop(useless_col, axis = 1, inplace = True)
In [34]:
df.head()
Out[34]:
hotel is_canceled lead_time arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies meal market_segment distribution_channel is_repeated_guest previous_cancellations previous_bookings_not_canceled reserved_room_type deposit_type agent company customer_type adr required_car_parking_spaces total_of_special_requests reservation_status_date
0 Resort Hotel 0 342 July 27 1 0 0 2 0.0 0 BB Direct Direct 0 0 0 C No Deposit 0.0 0.0 Transient 0.0 0 0 2015-07-01
1 Resort Hotel 0 737 July 27 1 0 0 2 0.0 0 BB Direct Direct 0 0 0 C No Deposit 0.0 0.0 Transient 0.0 0 0 2015-07-01
2 Resort Hotel 0 7 July 27 1 0 1 1 0.0 0 BB Direct Direct 0 0 0 A No Deposit 0.0 0.0 Transient 75.0 0 0 2015-07-02
3 Resort Hotel 0 13 July 27 1 0 1 1 0.0 0 BB Corporate Corporate 0 0 0 A No Deposit 304.0 0.0 Transient 75.0 0 0 2015-07-02
4 Resort Hotel 0 14 July 27 1 0 2 2 0.0 0 BB Online TA TA/TO 0 0 0 A No Deposit 240.0 0.0 Transient 98.0 0 1 2015-07-03
In [35]:
# creating numerical and categorical dataframes

cat_cols = [col for col in df.columns if df[col].dtype == 'O']
cat_cols
Out[35]:
['hotel',
 'arrival_date_month',
 'meal',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'deposit_type',
 'customer_type',
 'reservation_status_date']
In [36]:
cat_df = df[cat_cols]
cat_df.head()
Out[36]:
hotel arrival_date_month meal market_segment distribution_channel reserved_room_type deposit_type customer_type reservation_status_date
0 Resort Hotel July BB Direct Direct C No Deposit Transient 2015-07-01
1 Resort Hotel July BB Direct Direct C No Deposit Transient 2015-07-01
2 Resort Hotel July BB Direct Direct A No Deposit Transient 2015-07-02
3 Resort Hotel July BB Corporate Corporate A No Deposit Transient 2015-07-02
4 Resort Hotel July BB Online TA TA/TO A No Deposit Transient 2015-07-03
In [37]:
cat_df['reservation_status_date'] = pd.to_datetime(cat_df['reservation_status_date'])

cat_df['year'] = cat_df['reservation_status_date'].dt.year
cat_df['month'] = cat_df['reservation_status_date'].dt.month
cat_df['day'] = cat_df['reservation_status_date'].dt.day
In [38]:
cat_df.drop(['reservation_status_date','arrival_date_month'] , axis = 1, inplace = True)
In [39]:
cat_df.head()
Out[39]:
hotel meal market_segment distribution_channel reserved_room_type deposit_type customer_type year month day
0 Resort Hotel BB Direct Direct C No Deposit Transient 2015 7 1
1 Resort Hotel BB Direct Direct C No Deposit Transient 2015 7 1
2 Resort Hotel BB Direct Direct A No Deposit Transient 2015 7 2
3 Resort Hotel BB Corporate Corporate A No Deposit Transient 2015 7 2
4 Resort Hotel BB Online TA TA/TO A No Deposit Transient 2015 7 3
In [40]:
# printing unique values of each column

for col in cat_df.columns:
    print(f"{col}: \n{cat_df[col].unique()}\n")
hotel: 
['Resort Hotel' 'City Hotel']

meal: 
['BB' 'FB' 'HB' 'SC' 'Undefined']

market_segment: 
['Direct' 'Corporate' 'Online TA' 'Offline TA/TO' 'Complementary' 'Groups'
 'Undefined' 'Aviation']

distribution_channel: 
['Direct' 'Corporate' 'TA/TO' 'Undefined' 'GDS']

reserved_room_type: 
['C' 'A' 'D' 'E' 'G' 'F' 'H' 'L' 'B']

deposit_type: 
['No Deposit' 'Refundable' 'Non Refund']

customer_type: 
['Transient' 'Contract' 'Transient-Party' 'Group']

year: 
[2015 2014 2016 2017]

month: 
[ 7  5  4  6  3  8  9  1 11 10 12  2]

day: 
[ 1  2  3  6 22 23  5  7  8 11 15 16 29 19 18  9 13  4 12 26 17 10 20 14
 30 28 25 21 27 24 31]

In [41]:
# encoding categorical variables

cat_df['hotel'] = cat_df['hotel'].map({'Resort Hotel' : 0, 'City Hotel' : 1})

cat_df['meal'] = cat_df['meal'].map({'BB' : 0, 'FB': 1, 'HB': 2, 'SC': 3, 'Undefined': 4})

cat_df['market_segment'] = cat_df['market_segment'].map({'Direct': 0, 'Corporate': 1, 'Online TA': 2, 'Offline TA/TO': 3,
                                                           'Complementary': 4, 'Groups': 5, 'Undefined': 6, 'Aviation': 7})

cat_df['distribution_channel'] = cat_df['distribution_channel'].map({'Direct': 0, 'Corporate': 1, 'TA/TO': 2, 'Undefined': 3,
                                                                       'GDS': 4})

cat_df['reserved_room_type'] = cat_df['reserved_room_type'].map({'C': 0, 'A': 1, 'D': 2, 'E': 3, 'G': 4, 'F': 5, 'H': 6,
                                                                   'L': 7, 'B': 8})

cat_df['deposit_type'] = cat_df['deposit_type'].map({'No Deposit': 0, 'Refundable': 1, 'Non Refund': 3})

cat_df['customer_type'] = cat_df['customer_type'].map({'Transient': 0, 'Contract': 1, 'Transient-Party': 2, 'Group': 3})

cat_df['year'] = cat_df['year'].map({2015: 0, 2014: 1, 2016: 2, 2017: 3})
In [42]:
cat_df.head()
Out[42]:
hotel meal market_segment distribution_channel reserved_room_type deposit_type customer_type year month day
0 0 0 0 0 0 0 0 0 7 1
1 0 0 0 0 0 0 0 0 7 1
2 0 0 0 0 1 0 0 0 7 2
3 0 0 1 1 1 0 0 0 7 2
4 0 0 2 2 1 0 0 0 7 3
In [43]:
num_df = df.drop(columns = cat_cols, axis = 1)
num_df.drop('is_canceled', axis = 1, inplace = True)
num_df
Out[43]:
lead_time arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies is_repeated_guest previous_cancellations previous_bookings_not_canceled agent company adr required_car_parking_spaces total_of_special_requests
0 342 27 1 0 0 2 0.0 0 0 0 0 0.0 0.0 0.00 0 0
1 737 27 1 0 0 2 0.0 0 0 0 0 0.0 0.0 0.00 0 0
2 7 27 1 0 1 1 0.0 0 0 0 0 0.0 0.0 75.00 0 0
3 13 27 1 0 1 1 0.0 0 0 0 0 304.0 0.0 75.00 0 0
4 14 27 1 0 2 2 0.0 0 0 0 0 240.0 0.0 98.00 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
119385 23 35 30 2 5 2 0.0 0 0 0 0 394.0 0.0 96.14 0 0
119386 102 35 31 2 5 3 0.0 0 0 0 0 9.0 0.0 225.43 0 2
119387 34 35 31 2 5 2 0.0 0 0 0 0 9.0 0.0 157.71 0 4
119388 109 35 31 2 5 2 0.0 0 0 0 0 89.0 0.0 104.40 0 0
119389 205 35 29 2 7 2 0.0 0 0 0 0 9.0 0.0 151.20 0 2

119210 rows × 16 columns

In [44]:
num_df.var()
Out[44]:
lead_time                         11422.361808
arrival_date_week_number            184.990111
arrival_date_day_of_month            77.107192
stays_in_weekend_nights               0.990258
stays_in_week_nights                  3.599010
adults                                0.330838
children                              0.159070
babies                                0.009508
is_repeated_guest                     0.030507
previous_cancellations                0.713887
previous_bookings_not_canceled        2.244415
agent                             11485.169679
company                            2897.684308
adr                                2543.589039
required_car_parking_spaces           0.060201
total_of_special_requests             0.628652
dtype: float64
In [45]:
# normalizing numerical variables

num_df['lead_time'] = np.log(num_df['lead_time'] + 1)
num_df['arrival_date_week_number'] = np.log(num_df['arrival_date_week_number'] + 1)
num_df['arrival_date_day_of_month'] = np.log(num_df['arrival_date_day_of_month'] + 1)
num_df['agent'] = np.log(num_df['agent'] + 1)
num_df['company'] = np.log(num_df['company'] + 1)
num_df['adr'] = np.log(num_df['adr'] + 1)
In [46]:
num_df.var()
Out[46]:
lead_time                         2.582757
arrival_date_week_number          0.440884
arrival_date_day_of_month         0.506325
stays_in_weekend_nights           0.990258
stays_in_week_nights              3.599010
adults                            0.330838
children                          0.159070
babies                            0.009508
is_repeated_guest                 0.030507
previous_cancellations            0.713887
previous_bookings_not_canceled    2.244415
agent                             3.535793
company                           1.346883
adr                               0.515480
required_car_parking_spaces       0.060201
total_of_special_requests         0.628652
dtype: float64
In [47]:
num_df['adr'] = num_df['adr'].fillna(value = num_df['adr'].mean())
In [48]:
num_df.head()
Out[48]:
lead_time arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies is_repeated_guest previous_cancellations previous_bookings_not_canceled agent company adr required_car_parking_spaces total_of_special_requests
0 5.837730 3.332205 0.693147 0 0 2 0.0 0 0 0 0 0.000000 0.0 0.000000 0 0
1 6.603944 3.332205 0.693147 0 0 2 0.0 0 0 0 0 0.000000 0.0 0.000000 0 0
2 2.079442 3.332205 0.693147 0 1 1 0.0 0 0 0 0 0.000000 0.0 4.330733 0 0
3 2.639057 3.332205 0.693147 0 1 1 0.0 0 0 0 0 5.720312 0.0 4.330733 0 0
4 2.708050 3.332205 0.693147 0 2 2 0.0 0 0 0 0 5.484797 0.0 4.595120 0 1
In [49]:
X = pd.concat([cat_df, num_df], axis = 1)
y = df['is_canceled']
In [50]:
X.shape, y.shape
Out[50]:
((119210, 26), (119210,))
In [51]:
# splitting data into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)
In [52]:
X_train.head()
Out[52]:
hotel meal market_segment distribution_channel reserved_room_type deposit_type customer_type year month day lead_time arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies is_repeated_guest previous_cancellations previous_bookings_not_canceled agent company adr required_car_parking_spaces total_of_special_requests
81904 1 0 3 2 1 3 0 0 12 18 5.204007 2.995732 1.791759 2 4 2 0.0 0 0 1 0 3.295837 0.0 4.510860 0 0
66526 1 0 2 2 1 0 0 3 1 26 4.521789 2.833213 3.091042 1 2 2 0.0 0 0 0 0 2.302585 0.0 4.844187 0 0
95207 1 0 0 0 1 0 0 2 8 16 2.564949 3.555348 2.772589 1 0 2 0.0 0 0 0 0 2.708050 0.0 5.075174 0 0
8043 0 0 2 2 2 0 0 2 3 8 5.752573 3.637586 2.079442 2 7 2 0.0 0 0 0 0 5.484797 0.0 4.510860 0 1
9332 0 0 2 2 2 0 0 2 7 7 4.941642 3.850148 2.564949 2 5 2 0.0 0 0 0 0 5.484797 0.0 3.867026 0 0
In [53]:
X_test.head()
Out[53]:
hotel meal market_segment distribution_channel reserved_room_type deposit_type customer_type year month day lead_time arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children babies is_repeated_guest previous_cancellations previous_bookings_not_canceled agent company adr required_car_parking_spaces total_of_special_requests
20042 0 0 2 2 1 0 0 2 1 15 0.000000 1.386294 2.708050 0 1 1 0.0 0 0 0 0 5.484797 0.0 3.713572 0 0
24451 0 0 3 2 1 0 1 2 5 22 4.770685 3.091042 2.772589 2 5 2 0.0 0 0 0 0 5.497168 0.0 3.648057 0 0
113817 1 0 2 2 1 0 0 3 6 12 3.555348 3.178054 2.197225 1 3 1 0.0 0 0 0 0 4.454347 0.0 4.691348 0 0
115233 1 3 2 2 1 0 0 3 7 3 1.386294 3.295837 3.401197 1 3 2 0.0 0 0 0 0 2.302585 0.0 5.013963 0 1
22831 0 0 0 0 5 0 0 2 4 8 4.343805 2.772589 1.386294 2 3 2 0.0 0 0 0 0 5.525453 0.0 4.434382 0 1
In [54]:
y_train.head(), y_test.head()
Out[54]:
(81904    1
 66526    1
 95207    0
 8043     1
 9332     1
 Name: is_canceled, dtype: int64,
 20042     0
 24451     0
 113817    0
 115233    0
 22831     0
 Name: is_canceled, dtype: int64)

Hotel%20booking%20prediction%2010.png

In [55]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

acc_lr = accuracy_score(y_test, y_pred_lr)
conf = confusion_matrix(y_test, y_pred_lr)
clf_report = classification_report(y_test, y_pred_lr)

print(f"Accuracy Score of Logistic Regression is : {acc_lr}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Logistic Regression is : 0.8099711992841764
Confusion Matrix : 
[[21253  1307]
 [ 5489  7714]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.79      0.94      0.86     22560
           1       0.86      0.58      0.69     13203

    accuracy                           0.81     35763
   macro avg       0.82      0.76      0.78     35763
weighted avg       0.82      0.81      0.80     35763

Hotel%20booking%20prediction%2011.png

In [56]:
knn = KNeighborsClassifier()

X_train = np.ascontiguousarray(X_train)
X_test = np.ascontiguousarray(X_test)
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)

acc_knn = accuracy_score(y_test, y_pred_knn)
conf = confusion_matrix(y_test, y_pred_knn)
clf_report = classification_report(y_test, y_pred_knn)

print(f"Accuracy Score of KNN is : {acc_knn}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of KNN is : 0.892906076112183
Confusion Matrix : 
[[21765   795]
 [ 3035 10168]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.88      0.96      0.92     22560
           1       0.93      0.77      0.84     13203

    accuracy                           0.89     35763
   macro avg       0.90      0.87      0.88     35763
weighted avg       0.90      0.89      0.89     35763

Hotel%20booking%20prediction%2012.png

In [57]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

y_pred_dtc = dtc.predict(X_test)

acc_dtc = accuracy_score(y_test, y_pred_dtc)
conf = confusion_matrix(y_test, y_pred_dtc)
clf_report = classification_report(y_test, y_pred_dtc)

print(f"Accuracy Score of Decision Tree is : {acc_dtc}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Decision Tree is : 0.9502558510192098
Confusion Matrix : 
[[21658   902]
 [  877 12326]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     22560
           1       0.93      0.93      0.93     13203

    accuracy                           0.95     35763
   macro avg       0.95      0.95      0.95     35763
weighted avg       0.95      0.95      0.95     35763

Hotel%20booking%20prediction%2013.png

In [58]:
rd_clf = RandomForestClassifier()
rd_clf.fit(X_train, y_train)

y_pred_rd_clf = rd_clf.predict(X_test)

acc_rd_clf = accuracy_score(y_test, y_pred_rd_clf)
conf = confusion_matrix(y_test, y_pred_rd_clf)
clf_report = classification_report(y_test, y_pred_rd_clf)

print(f"Accuracy Score of Random Forest is : {acc_rd_clf}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Random Forest is : 0.9545340156027179
Confusion Matrix : 
[[22396   164]
 [ 1462 11741]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.94      0.99      0.96     22560
           1       0.99      0.89      0.94     13203

    accuracy                           0.95     35763
   macro avg       0.96      0.94      0.95     35763
weighted avg       0.96      0.95      0.95     35763

Hotel%20booking%20prediction%2014.png

In [59]:
ada = AdaBoostClassifier(base_estimator = dtc)
ada.fit(X_train, y_train)

y_pred_ada = ada.predict(X_test)

acc_ada = accuracy_score(y_test, y_pred_ada)
conf = confusion_matrix(y_test, y_pred_ada)
clf_report = classification_report(y_test, y_pred_ada)

print(f"Accuracy Score of Ada Boost Classifier is : {acc_ada}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Ada Boost Classifier is : 0.9499482705589576
Confusion Matrix : 
[[21646   914]
 [  876 12327]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     22560
           1       0.93      0.93      0.93     13203

    accuracy                           0.95     35763
   macro avg       0.95      0.95      0.95     35763
weighted avg       0.95      0.95      0.95     35763

Hotel%20booking%20prediction%2015.png

In [60]:
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)

acc_gb = accuracy_score(y_test, y_pred_gb)
conf = confusion_matrix(y_test, y_pred_gb)
clf_report = classification_report(y_test, y_pred_gb)

print(f"Accuracy Score of Ada Boost Classifier is : {acc_gb}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Ada Boost Classifier is : 0.9204764700947907
Confusion Matrix : 
[[22440   120]
 [ 2724 10479]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     22560
           1       0.99      0.79      0.88     13203

    accuracy                           0.92     35763
   macro avg       0.94      0.89      0.91     35763
weighted avg       0.93      0.92      0.92     35763

Hotel%20booking%20prediction%2016.png

In [61]:
xgb = XGBClassifier(booster = 'gbtree', learning_rate = 0.1, max_depth = 5, n_estimators = 180)
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)

acc_xgb = accuracy_score(y_test, y_pred_xgb)
conf = confusion_matrix(y_test, y_pred_xgb)
clf_report = classification_report(y_test, y_pred_xgb)

print(f"Accuracy Score of Ada Boost Classifier is : {acc_xgb}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Ada Boost Classifier is : 0.9835863881665409
Confusion Matrix : 
[[22542    18]
 [  569 12634]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     22560
           1       1.00      0.96      0.98     13203

    accuracy                           0.98     35763
   macro avg       0.99      0.98      0.98     35763
weighted avg       0.98      0.98      0.98     35763

Hotel%20booking%20prediction%2018.png

In [62]:
etc = ExtraTreesClassifier()
etc.fit(X_train, y_train)

y_pred_etc = etc.predict(X_test)

acc_etc = accuracy_score(y_test, y_pred_etc)
conf = confusion_matrix(y_test, y_pred_etc)
clf_report = classification_report(y_test, y_pred_etc)

print(f"Accuracy Score of Ada Boost Classifier is : {acc_etc}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Ada Boost Classifier is : 0.9523529905209295
Confusion Matrix : 
[[22346   214]
 [ 1490 11713]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.94      0.99      0.96     22560
           1       0.98      0.89      0.93     13203

    accuracy                           0.95     35763
   macro avg       0.96      0.94      0.95     35763
weighted avg       0.95      0.95      0.95     35763

Hotel%20booking%20prediction%2019.png

In [63]:
lgbm = LGBMClassifier(learning_rate = 1)
lgbm.fit(X_train, y_train)

y_pred_lgbm = lgbm.predict(X_test)

acc_lgbm = accuracy_score(y_test, y_pred_lgbm)
conf = confusion_matrix(y_test, y_pred_lgbm)
clf_report = classification_report(y_test, y_pred_lgbm)

print(f"Accuracy Score of Ada Boost Classifier is : {acc_lgbm}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
[LightGBM] [Info] Number of positive: 30996, number of negative: 52451
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014764 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1216
[LightGBM] [Info] Number of data points in the train set: 83447, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.371445 -> initscore=-0.526021
[LightGBM] [Info] Start training from score -0.526021
Accuracy Score of Ada Boost Classifier is : 0.949416995218522
Confusion Matrix : 
[[21763   797]
 [ 1012 12191]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     22560
           1       0.94      0.92      0.93     13203

    accuracy                           0.95     35763
   macro avg       0.95      0.94      0.95     35763
weighted avg       0.95      0.95      0.95     35763

Hotel%20booking%20prediction%2020.png

In [65]:
classifiers = [('Gradient Boosting Classifier', gb), ('XGboost', xgb),  ('Decision Tree', dtc),
               ('Extra Tree', etc), ('Light Gradient', lgbm), ('Random Forest', rd_clf), ('Ada Boost', ada), ('Logistic', lr),
               ('Knn', knn)]
vc = VotingClassifier(estimators = classifiers)
vc.fit(X_train, y_train)
[LightGBM] [Info] Number of positive: 30996, number of negative: 52451
[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007571 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1216
[LightGBM] [Info] Number of data points in the train set: 83447, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.371445 -> initscore=-0.526021
[LightGBM] [Info] Start training from score -0.526021
Out[65]:
VotingClassifier(estimators=[('Gradient Boosting Classifier',
                              GradientBoostingClassifier()),
                             ('XGboost',
                              XGBClassifier(base_score=None, booster='gbtree',
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, gamma=None,
                                            gpu_id=None, grow_polic...
                                            num_parallel_tree=None,
                                            predictor=None, random_state=None, ...)),
                             ('Decision Tree', DecisionTreeClassifier()),
                             ('Extra Tree', ExtraTreesClassifier()),
                             ('Light Gradient',
                              LGBMClassifier(learning_rate=1)),
                             ('Random Forest', RandomForestClassifier()),
                             ('Ada Boost',
                              AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
                             ('Logistic', LogisticRegression()),
                             ('Knn', KNeighborsClassifier())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
VotingClassifier(estimators=[('Gradient Boosting Classifier',
                              GradientBoostingClassifier()),
                             ('XGboost',
                              XGBClassifier(base_score=None, booster='gbtree',
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            early_stopping_rounds=None,
                                            enable_categorical=False,
                                            eval_metric=None,
                                            feature_types=None, gamma=None,
                                            gpu_id=None, grow_polic...
                                            num_parallel_tree=None,
                                            predictor=None, random_state=None, ...)),
                             ('Decision Tree', DecisionTreeClassifier()),
                             ('Extra Tree', ExtraTreesClassifier()),
                             ('Light Gradient',
                              LGBMClassifier(learning_rate=1)),
                             ('Random Forest', RandomForestClassifier()),
                             ('Ada Boost',
                              AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
                             ('Logistic', LogisticRegression()),
                             ('Knn', KNeighborsClassifier())])
GradientBoostingClassifier()
XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=180, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)
DecisionTreeClassifier()
ExtraTreesClassifier()
LGBMClassifier(learning_rate=1)
RandomForestClassifier()
DecisionTreeClassifier()
DecisionTreeClassifier()
LogisticRegression()
KNeighborsClassifier()
In [66]:
y_pred_vc = vc.predict(X_test)

acc_vtc = accuracy_score(y_test, y_pred_vc)
conf = confusion_matrix(y_test, y_pred_vc)
clf_report = classification_report(y_test, y_pred_vc)

print(f"Accuracy Score of Ada Boost Classifier is : {acc_vtc}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Ada Boost Classifier is : 0.9652434079914995
Confusion Matrix : 
[[22530    30]
 [ 1213 11990]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     22560
           1       1.00      0.91      0.95     13203

    accuracy                           0.97     35763
   macro avg       0.97      0.95      0.96     35763
weighted avg       0.97      0.97      0.96     35763

Hotel%20booking%20prediction%2021.png

In [67]:
from tensorflow.keras.utils import to_categorical

X = pd.concat([cat_df, num_df], axis = 1)
y = to_categorical(df['is_canceled'])
In [68]:
# splitting data into training set and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)
In [69]:
import keras
from keras.layers import Dense
from keras.models import Sequential

model  = Sequential()
model.add(Dense(100, activation = 'relu', input_shape = (26, )))
model.add(Dense(100, activation = 'relu'))
model.add(Dense(2, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model_history = model.fit(X_train, y_train, validation_data = (X_test, y_test),
                          epochs = 100)
Epoch 1/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.3595 - accuracy: 0.8476 - val_loss: 0.2459 - val_accuracy: 0.9108
Epoch 2/100
2608/2608 [==============================] - 12s 4ms/step - loss: 0.1780 - accuracy: 0.9387 - val_loss: 0.1419 - val_accuracy: 0.9515
Epoch 3/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.1277 - accuracy: 0.9583 - val_loss: 0.1103 - val_accuracy: 0.9634
Epoch 4/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.1046 - accuracy: 0.9657 - val_loss: 0.0831 - val_accuracy: 0.9748
Epoch 5/100
2608/2608 [==============================] - 12s 4ms/step - loss: 0.0914 - accuracy: 0.9704 - val_loss: 0.0819 - val_accuracy: 0.9767
Epoch 6/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0801 - accuracy: 0.9748 - val_loss: 0.0772 - val_accuracy: 0.9779
Epoch 7/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0733 - accuracy: 0.9769 - val_loss: 0.0711 - val_accuracy: 0.9771
Epoch 8/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0672 - accuracy: 0.9787 - val_loss: 0.0631 - val_accuracy: 0.9824
Epoch 9/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0633 - accuracy: 0.9801 - val_loss: 0.0795 - val_accuracy: 0.9760
Epoch 10/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0584 - accuracy: 0.9819 - val_loss: 0.0568 - val_accuracy: 0.9837
Epoch 11/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0560 - accuracy: 0.9822 - val_loss: 0.0529 - val_accuracy: 0.9839
Epoch 12/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0531 - accuracy: 0.9828 - val_loss: 0.0456 - val_accuracy: 0.9875
Epoch 13/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0517 - accuracy: 0.9839 - val_loss: 0.0613 - val_accuracy: 0.9805
Epoch 14/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0518 - accuracy: 0.9834 - val_loss: 0.0491 - val_accuracy: 0.9874
Epoch 15/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0468 - accuracy: 0.9849 - val_loss: 0.0476 - val_accuracy: 0.9859
Epoch 16/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0470 - accuracy: 0.9849 - val_loss: 0.0538 - val_accuracy: 0.9829
Epoch 17/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0444 - accuracy: 0.9861 - val_loss: 0.0732 - val_accuracy: 0.9803
Epoch 18/100
2608/2608 [==============================] - 12s 4ms/step - loss: 0.0428 - accuracy: 0.9861 - val_loss: 0.0414 - val_accuracy: 0.9882
Epoch 19/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0440 - accuracy: 0.9860 - val_loss: 0.0365 - val_accuracy: 0.9906
Epoch 20/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0419 - accuracy: 0.9868 - val_loss: 0.0520 - val_accuracy: 0.9846
Epoch 21/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0406 - accuracy: 0.9872 - val_loss: 0.0583 - val_accuracy: 0.9823
Epoch 22/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0395 - accuracy: 0.9876 - val_loss: 0.0604 - val_accuracy: 0.9817
Epoch 23/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0390 - accuracy: 0.9878 - val_loss: 0.0429 - val_accuracy: 0.9874
Epoch 24/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0391 - accuracy: 0.9872 - val_loss: 0.0356 - val_accuracy: 0.9902
Epoch 25/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0381 - accuracy: 0.9879 - val_loss: 0.0446 - val_accuracy: 0.9873
Epoch 26/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0351 - accuracy: 0.9890 - val_loss: 0.0320 - val_accuracy: 0.9904
Epoch 27/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0368 - accuracy: 0.9884 - val_loss: 0.0451 - val_accuracy: 0.9867
Epoch 28/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0374 - accuracy: 0.9882 - val_loss: 0.0335 - val_accuracy: 0.9908
Epoch 29/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0335 - accuracy: 0.9894 - val_loss: 0.0378 - val_accuracy: 0.9891
Epoch 30/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0329 - accuracy: 0.9897 - val_loss: 0.0399 - val_accuracy: 0.9889
Epoch 31/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0346 - accuracy: 0.9894 - val_loss: 0.0361 - val_accuracy: 0.9890
Epoch 32/100
2608/2608 [==============================] - 14s 5ms/step - loss: 0.0321 - accuracy: 0.9900 - val_loss: 0.0338 - val_accuracy: 0.9897
Epoch 33/100
2608/2608 [==============================] - 15s 6ms/step - loss: 0.0318 - accuracy: 0.9896 - val_loss: 0.0348 - val_accuracy: 0.9907
Epoch 34/100
2608/2608 [==============================] - 15s 6ms/step - loss: 0.0343 - accuracy: 0.9889 - val_loss: 0.0779 - val_accuracy: 0.9767
Epoch 35/100
2608/2608 [==============================] - 15s 6ms/step - loss: 0.0297 - accuracy: 0.9907 - val_loss: 0.0434 - val_accuracy: 0.9878
Epoch 36/100
2608/2608 [==============================] - 14s 5ms/step - loss: 0.0303 - accuracy: 0.9902 - val_loss: 0.0368 - val_accuracy: 0.9905
Epoch 37/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0300 - accuracy: 0.9906 - val_loss: 0.0408 - val_accuracy: 0.9898
Epoch 38/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0296 - accuracy: 0.9904 - val_loss: 0.0616 - val_accuracy: 0.9832
Epoch 39/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0309 - accuracy: 0.9903 - val_loss: 0.0369 - val_accuracy: 0.9912
Epoch 40/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0296 - accuracy: 0.9904 - val_loss: 0.0473 - val_accuracy: 0.9878
Epoch 41/100
2608/2608 [==============================] - 11s 4ms/step - loss: 0.0325 - accuracy: 0.9900 - val_loss: 0.0303 - val_accuracy: 0.9916
Epoch 42/100
2608/2608 [==============================] - 11s 4ms/step - loss: 0.0297 - accuracy: 0.9907 - val_loss: 0.0637 - val_accuracy: 0.9836
Epoch 43/100
2608/2608 [==============================] - 12s 4ms/step - loss: 0.0288 - accuracy: 0.9910 - val_loss: 0.0364 - val_accuracy: 0.9892
Epoch 44/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0264 - accuracy: 0.9912 - val_loss: 0.0350 - val_accuracy: 0.9897
Epoch 45/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0283 - accuracy: 0.9910 - val_loss: 0.0374 - val_accuracy: 0.9886
Epoch 46/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0291 - accuracy: 0.9912 - val_loss: 0.0534 - val_accuracy: 0.9854
Epoch 47/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0263 - accuracy: 0.9915 - val_loss: 0.0316 - val_accuracy: 0.9907
Epoch 48/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0280 - accuracy: 0.9913 - val_loss: 0.0384 - val_accuracy: 0.9894
Epoch 49/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0259 - accuracy: 0.9914 - val_loss: 0.0400 - val_accuracy: 0.9888
Epoch 50/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0277 - accuracy: 0.9910 - val_loss: 0.0425 - val_accuracy: 0.9884
Epoch 51/100
2608/2608 [==============================] - 11s 4ms/step - loss: 0.0277 - accuracy: 0.9916 - val_loss: 0.0290 - val_accuracy: 0.9912
Epoch 52/100
2608/2608 [==============================] - 12s 4ms/step - loss: 0.0244 - accuracy: 0.9922 - val_loss: 0.0521 - val_accuracy: 0.9874
Epoch 53/100
2608/2608 [==============================] - 11s 4ms/step - loss: 0.0268 - accuracy: 0.9913 - val_loss: 0.0257 - val_accuracy: 0.9925
Epoch 54/100
2608/2608 [==============================] - 11s 4ms/step - loss: 0.0262 - accuracy: 0.9916 - val_loss: 0.0487 - val_accuracy: 0.9876
Epoch 55/100
2608/2608 [==============================] - 11s 4ms/step - loss: 0.0259 - accuracy: 0.9917 - val_loss: 0.0332 - val_accuracy: 0.9897
Epoch 56/100
2608/2608 [==============================] - 11s 4ms/step - loss: 0.0240 - accuracy: 0.9922 - val_loss: 0.0325 - val_accuracy: 0.9909
Epoch 57/100
2608/2608 [==============================] - 11s 4ms/step - loss: 0.0256 - accuracy: 0.9914 - val_loss: 0.0364 - val_accuracy: 0.9898
Epoch 58/100
2608/2608 [==============================] - 11s 4ms/step - loss: 0.0252 - accuracy: 0.9924 - val_loss: 0.0329 - val_accuracy: 0.9907
Epoch 59/100
2608/2608 [==============================] - 12s 4ms/step - loss: 0.0254 - accuracy: 0.9920 - val_loss: 0.0789 - val_accuracy: 0.9861
Epoch 60/100
2608/2608 [==============================] - 11s 4ms/step - loss: 0.0237 - accuracy: 0.9926 - val_loss: 0.0432 - val_accuracy: 0.9887
Epoch 61/100
2608/2608 [==============================] - 11s 4ms/step - loss: 0.0239 - accuracy: 0.9922 - val_loss: 0.0275 - val_accuracy: 0.9922
Epoch 62/100
2608/2608 [==============================] - 12s 4ms/step - loss: 0.0226 - accuracy: 0.9927 - val_loss: 0.0472 - val_accuracy: 0.9892
Epoch 63/100
2608/2608 [==============================] - 11s 4ms/step - loss: 0.0222 - accuracy: 0.9927 - val_loss: 0.0302 - val_accuracy: 0.9914
Epoch 64/100
2608/2608 [==============================] - 12s 4ms/step - loss: 0.0242 - accuracy: 0.9919 - val_loss: 0.0349 - val_accuracy: 0.9907
Epoch 65/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0238 - accuracy: 0.9924 - val_loss: 0.0312 - val_accuracy: 0.9917
Epoch 66/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0220 - accuracy: 0.9928 - val_loss: 0.0350 - val_accuracy: 0.9901
Epoch 67/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0247 - accuracy: 0.9919 - val_loss: 0.0398 - val_accuracy: 0.9905
Epoch 68/100
2608/2608 [==============================] - 14s 5ms/step - loss: 0.0214 - accuracy: 0.9932 - val_loss: 0.0397 - val_accuracy: 0.9897
Epoch 69/100
2608/2608 [==============================] - 14s 5ms/step - loss: 0.0221 - accuracy: 0.9929 - val_loss: 0.0346 - val_accuracy: 0.9908
Epoch 70/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0233 - accuracy: 0.9928 - val_loss: 0.0336 - val_accuracy: 0.9910
Epoch 71/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0221 - accuracy: 0.9931 - val_loss: 0.0376 - val_accuracy: 0.9902
Epoch 72/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0213 - accuracy: 0.9931 - val_loss: 0.0384 - val_accuracy: 0.9914
Epoch 73/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0223 - accuracy: 0.9928 - val_loss: 0.0562 - val_accuracy: 0.9808
Epoch 74/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0212 - accuracy: 0.9929 - val_loss: 0.0392 - val_accuracy: 0.9876
Epoch 75/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0229 - accuracy: 0.9924 - val_loss: 0.0337 - val_accuracy: 0.9897
Epoch 76/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0208 - accuracy: 0.9933 - val_loss: 0.0476 - val_accuracy: 0.9891
Epoch 77/100
2608/2608 [==============================] - 14s 5ms/step - loss: 0.0203 - accuracy: 0.9933 - val_loss: 0.0347 - val_accuracy: 0.9916
Epoch 78/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0208 - accuracy: 0.9930 - val_loss: 0.0302 - val_accuracy: 0.9919
Epoch 79/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0205 - accuracy: 0.9933 - val_loss: 0.0360 - val_accuracy: 0.9909
Epoch 80/100
2608/2608 [==============================] - 14s 5ms/step - loss: 0.0205 - accuracy: 0.9935 - val_loss: 0.0280 - val_accuracy: 0.9925
Epoch 81/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0203 - accuracy: 0.9933 - val_loss: 0.0326 - val_accuracy: 0.9917
Epoch 82/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0199 - accuracy: 0.9936 - val_loss: 0.0434 - val_accuracy: 0.9893
Epoch 83/100
2608/2608 [==============================] - 14s 5ms/step - loss: 0.0194 - accuracy: 0.9938 - val_loss: 0.0388 - val_accuracy: 0.9910
Epoch 84/100
2608/2608 [==============================] - 14s 5ms/step - loss: 0.0184 - accuracy: 0.9942 - val_loss: 0.0298 - val_accuracy: 0.9918
Epoch 85/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0212 - accuracy: 0.9930 - val_loss: 0.0404 - val_accuracy: 0.9893
Epoch 86/100
2608/2608 [==============================] - 14s 5ms/step - loss: 0.0213 - accuracy: 0.9930 - val_loss: 0.0285 - val_accuracy: 0.9926
Epoch 87/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0204 - accuracy: 0.9935 - val_loss: 0.0318 - val_accuracy: 0.9927
Epoch 88/100
2608/2608 [==============================] - 12s 5ms/step - loss: 0.0186 - accuracy: 0.9941 - val_loss: 0.0331 - val_accuracy: 0.9920
Epoch 89/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0208 - accuracy: 0.9935 - val_loss: 0.0353 - val_accuracy: 0.9904
Epoch 90/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0193 - accuracy: 0.9936 - val_loss: 0.0427 - val_accuracy: 0.9883
Epoch 91/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0180 - accuracy: 0.9940 - val_loss: 0.0390 - val_accuracy: 0.9911
Epoch 92/100
2608/2608 [==============================] - 14s 5ms/step - loss: 0.0191 - accuracy: 0.9938 - val_loss: 0.0324 - val_accuracy: 0.9912
Epoch 93/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0173 - accuracy: 0.9942 - val_loss: 0.0498 - val_accuracy: 0.9862
Epoch 94/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0180 - accuracy: 0.9941 - val_loss: 0.0505 - val_accuracy: 0.9867
Epoch 95/100
2608/2608 [==============================] - 14s 5ms/step - loss: 0.0179 - accuracy: 0.9941 - val_loss: 0.0710 - val_accuracy: 0.9863
Epoch 96/100
2608/2608 [==============================] - 14s 5ms/step - loss: 0.0188 - accuracy: 0.9939 - val_loss: 0.0368 - val_accuracy: 0.9917
Epoch 97/100
2608/2608 [==============================] - 14s 5ms/step - loss: 0.0172 - accuracy: 0.9943 - val_loss: 0.0353 - val_accuracy: 0.9918
Epoch 98/100
2608/2608 [==============================] - 15s 6ms/step - loss: 0.0189 - accuracy: 0.9936 - val_loss: 0.0377 - val_accuracy: 0.9920
Epoch 99/100
2608/2608 [==============================] - 14s 5ms/step - loss: 0.0170 - accuracy: 0.9941 - val_loss: 0.0306 - val_accuracy: 0.9921
Epoch 100/100
2608/2608 [==============================] - 13s 5ms/step - loss: 0.0185 - accuracy: 0.9941 - val_loss: 0.0370 - val_accuracy: 0.9923
In [70]:
plt.figure(figsize = (12, 6))

train_loss = model_history.history['loss']
val_loss = model_history.history['val_loss'] 
epoch = range(1, 101)

loss = pd.DataFrame({'train_loss' : train_loss, 'val_loss' : val_loss})

px.line(data_frame = loss, x = epoch, y = ['val_loss', 'train_loss'], title = 'Training and Validation Loss',
        template = 'plotly_dark')
<Figure size 1200x600 with 0 Axes>
In [71]:
plt.figure(figsize = (12, 6))

train_acc = model_history.history['accuracy']
val_acc = model_history.history['val_accuracy'] 
epoch = range(1, 101)


accuracy = pd.DataFrame({'train_acc' : train_acc, 'val_acc' : val_acc})

px.line(data_frame = accuracy, x = epoch, y = ['val_acc', 'train_acc'], title = 'Training and Validation Accuracy',
        template = 'plotly_dark')
<Figure size 1200x600 with 0 Axes>
In [72]:
acc_ann = model.evaluate(X_test, y_test)[1]

print(f'Accuracy of model is {acc_ann}')
1118/1118 [==============================] - 3s 3ms/step - loss: 0.0370 - accuracy: 0.9923
Accuracy of model is 0.9923384785652161

Hotel%20booking%20prediction%2022.png

In [73]:
models = pd.DataFrame({
    'Model' : ['Logistic Regression', 'KNN', 'Decision Tree Classifier', 'Random Forest Classifier','Ada Boost Classifier',
             'Gradient Boosting Classifier', 'XgBoost', 'Extra Trees Classifier', 'LGBM', 'Voting Classifier'
               , 'ANN'],
    'Score' : [acc_lr, acc_knn, acc_dtc, acc_rd_clf, acc_ada, acc_gb, acc_xgb, acc_etc, acc_lgbm, acc_vtc, acc_ann]
})


models.sort_values(by = 'Score', ascending = False)
Out[73]:
Model Score
10 ANN 0.992338
6 XgBoost 0.983586
9 Voting Classifier 0.965243
3 Random Forest Classifier 0.954534
7 Extra Trees Classifier 0.952353
2 Decision Tree Classifier 0.950256
4 Ada Boost Classifier 0.949948
8 LGBM 0.949417
5 Gradient Boosting Classifier 0.920476
1 KNN 0.892906
0 Logistic Regression 0.809971
In [74]:
px.bar(data_frame = models, x = 'Score', y = 'Model', color = 'Score', template = 'plotly_dark', title = 'Models Comparison')

Obtained an accuracy of 99% using ANN¶